********************************************************************************
*Data construction for Bingley and Cappellari 
**********************************************************************************
* --- Session set-up -------------------------------------------------------------
clear all
set more off
* memory request; only meaningful for old Stata releases (newer ones ignore it)
set mem 5000m
capture log close
log using inspection.log, replace

* Input: one row per man with reconstructed family links and yearly earnings in
* wide form, produced by "triplet-data-creation-20160706.do"
use earnwide-triplet-father-1935-1969-son-1955-1989

describe

* Birth order within the family: 0 = father, 1 = first son, 2 = second son, ...
generate bob = border

* Year and month of birth taken from the date of birth
generate yob = year(dob)
generate mob = month(dob)

tabulate yob bob


* Keep fathers, first sons and second sons only (third sons onwards are dropped)
drop if bob > 2


*selections on age and age spacing of sons **************************************
* For each family member type (father, first son, second son) create the year and
* month of birth of that member and copy them to every row of the family.
* pnrf is the family identifier.
local who0 f
local who1 s1
local who2 s2
forvalues k = 0/2 {
	local sfx `who`k''
	sort pnrf bob
	generate yob`sfx' = yob if bob == `k'
	generate mob`sfx' = mob if bob == `k'
	// missing values sort last, so row 1 of each family carries the member's
	// own values whenever that member is present
	sort pnrf yob`sfx'
	by pnrf: replace yob`sfx' = yob`sfx'[1] if yob`sfx' == .
	by pnrf: replace mob`sfx' = mob`sfx'[1] if mob`sfx' == .
}

tabulate yob bob

* Keep families whose first son was born 1959-1985: too few first sons are born
* before 1959, and those born after 1985 are too young to enter the profile at 25
* with 5 years of data. Families without a first son (yobs1 missing) go as well.
drop if !inrange(yobs1, 1959, 1985)

* Father's age at the birth of the first son, in months; drop families where it
* is below 18 years (216 months)
generate agedifffs1 = (12*yobs1 + mobs1) - (12*yobf + mobf)
drop if agedifffs1 < 216
tabulate yob bob

* Spacing between first and second son, in months.
* Below 12 months: every row of the family is dropped.
* Above 144 months (12 years): only the second son's row is dropped.
generate agediffs1s2 = (12*yobs2 + mobs2) - (12*yobs1 + mobs1)
drop if agediffs1s2 < 12
drop if bob == 2 & agediffs1s2 > 144 & agediffs1s2 != .

tabulate yob bob

* Second sons must be born 1962-1985 (otherwise too old or too young)
drop if bob == 2 & yob < 1962
drop if bob == 2 & yob > 1985

tabulate yob bob

* the helper variables are no longer needed
drop agediff* yobs* yobf mobs* mobf

save fsearnwide_b, replace

* Convert to long format.
* Step 1: write one file per calendar year holding the identifiers, birth
* information and that year's earnings.
forvalues yr = 1980/2014 {
	use pnr pnrf yob mob bob earn`yr' using fsearnwide_b, clear
	rename earn`yr' earnings
	generate year = `yr'
	save earnings`yr', replace
}

* Step 2: stack the yearly files in chronological order, deleting each one once
* it has been read.
use earnings1980, clear
erase earnings1980.dta
forvalues yr = 1981/2014 {
	append using earnings`yr'
	erase earnings`yr'.dta
}

save earnings_long, replace
	
* Form 3-year cohort groupings labelled by their middle year (1936, 1939, ...,
* 1984): a birth year one below or one above a middle year is assigned to it.
* Birth years outside 1935-1985 keep cohort equal to yob.
generate cohort = yob
forvalues mid = 1936(3)1984 {
	replace cohort = `mid' if abs(yob - `mid') == 1
}


* Individual age and age of the cohort in each calendar year
generate age = year - yob
generate ageco = year - cohort

* Keep observations where the cohort is aged 25 to 60
drop if !inrange(ageco, 25, 60)



*EARNINGS TRIMMINGS
* Keep only person-year observations with strictly positive earnings.
drop if earnings==.		//drop individual observations with missing earnings
drop if earnings <=0	//drop zero and negative earnings


* Trim the lower and upper 0.5% of the earnings distribution, separately for each
* family member type (bob = 0, 1, 2) and each calendar year.
*
* The two cut-offs are copied into scalars BEFORE anything is dropped. In recent
* Stata releases -drop- stores its own results in r(), which replaces the
* r(r1) / r(r199) left behind by -_pctile-; reading r(r1) after the first -drop-
* would then yield missing and "earnings < ." would delete the whole cell.
tempname cut_lo cut_hi
forv b =  0(1)2 {
	forv i=1980(1)2014 {
		_pctile earnings if year==`i' & bob==`b', nq(200)
		scalar `cut_lo' = r(r1)		// 0.5th percentile
		scalar `cut_hi' = r(r199)	// 99.5th percentile
		// skip cells where no cut-off could be computed (no observations)
		if !missing(`cut_lo', `cut_hi') {
			drop if earnings>`cut_hi' & year==`i' & bob==`b'
			drop if earnings<`cut_lo' & year==`i' & bob==`b'
		}
	}
}


*imposing at least 5 consecutive obs on wages
* A spell is a run of observations for the same person in consecutive calendar
* years. Persons whose longest spell is shorter than 5 years are dropped.
tempvar spellstart spellpos longest
sort pnr year
// a new spell starts at the person's first row or whenever the previous row is
// not the preceding calendar year
by pnr: generate `spellstart' = (_n == 1) | (year != year[_n-1] + 1)
// position of the row inside its spell: 1, 2, 3, ...
by pnr: generate `spellpos' = 1 if `spellstart'
by pnr: replace `spellpos' = `spellpos'[_n-1] + 1 if !`spellstart'
// length of the person's longest spell
by pnr: egen `longest' = max(`spellpos')
drop if `longest' < 5

drop `spellstart' `spellpos' `longest'



*Household structure******************************************************************
* Work out which family members are still present after the selections above.

sort pnrf bob year
by pnrf: generate bob1 = bob[1]		// birth order of the first member observed in the family
by pnrf: generate bobN = bob[_N]	// birth order of the last member observed in the family

* bobhole marks a second-son row that directly follows a father row, i.e. a
* family in which the first son is absent although a second son is present
by pnrf: generate bobhole = (bob == 2) & (bob[_n-1] == 0)
mvencode bobhole, mv(0) override
* spread the flag to every row of the family (the largest value sorts last)
bysort pnrf (bobhole): replace bobhole = bobhole[_N]
*************************************************************************************

* Keep families that contain the father and the first son
keep if bob1 == 0 & bobN > 0 & bobhole == 0

* One flagged row per person and one per family; which row gets the flag is not
* controlled because no secondary sort key is given
bysort pnr: generate prime = (_n == 1)
bysort pnrf: generate primedyn = (_n == 1)

tabulate bob if prime

generate logearn = ln(earnings)
save, replace

compress

* Age of the cohort measured as the distance from age 25
generate ageA = ageco - 25
quietly summarize ageA
tabulate ageA bob

* Copy of the individual age
generate ageC = age



save, replace

* Because there are too few fathers and matches in the youngest cohort of
* fathers, those families are dropped. After sorting by birth order, the first
* row of each family supplies the birth year used for the test.
bysort pnrf (bob year): generate yobF = yob[1]
drop if yobF > 1961
drop yobF
save, replace

* Finally, keep only complete triads: the highest birth order present in the
* family must be 2 (second son)
capture drop bobN
bysort pnrf (bob): generate bobN = bob[_N]
keep if bobN == 2
tabulate bob prime


save, replace


log close




